# Concrete compressive-strength regression problem (10 Marks), including
# duplicate handling. The order of this listing corresponds to the order of
# cells in the original notebook.
# Core libraries for data handling and visualisation.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence library deprecation/future warnings in the notebook output.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline (notebook-only syntax;
# not valid in a plain Python script).
%matplotlib inline
# Load the concrete compressive-strength dataset and take a first look.
# NOTE: the bare expressions below are notebook cells — they display their
# value in Jupyter but are no-ops when run as a plain script.
df = pd.read_csv('concrete.csv')
df.head(10)
df.shape
df.dtypes
df.describe().transpose()
df.skew()
# Data-quality checks: missing values, negative values, and exact zeros
# (zeros are plausible here for optional ingredients such as slag/ash).
df.isna().sum()
(df < 0.0).sum()
(df == 0.0).sum()
# Duplicate handling: report the duplicate rows, keep a pristine copy of the
# frame, then drop duplicates in place and confirm none remain.
print(df[df.duplicated() == True].shape)
df[df.duplicated() == True]
df_copy = df.copy()
df.drop_duplicates(inplace=True)
print(df.shape)
df[df.duplicated() == True]
# Univariate distribution of every feature plus the target on a 3x3 grid.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot with a KDE overlay is the supported replacement for the same plot.
fig, axes = plt.subplots(3, 3, figsize=(18, 10))
for ax, col in zip(axes.flat, ['cement', 'slag', 'ash', 'water', 'superplastic',
                               'coarseagg', 'fineagg', 'age', 'strength']):
    sns.histplot(df[col], kde=True, ax=ax)
# Box plot of every column to surface outliers, on a 3x3 grid.
# FIX: passing the data vector positionally was deprecated in seaborn 0.12;
# pass it by keyword (x=) for the same horizontal box plot.
fig, axes = plt.subplots(3, 3, figsize=(18, 10))
for ax, col in zip(axes.flat, ['cement', 'slag', 'ash', 'water', 'superplastic',
                               'coarseagg', 'fineagg', 'age', 'strength']):
    sns.boxplot(x=df[col], ax=ax)
# Correlation structure and pairwise relationships.
corr = df.corr()
sns.heatmap(corr, annot=True, fmt='.2f')
corr
sns.pairplot(df)
df.nunique()
# Trend of strength against each individual feature.
# NOTE(review): in the notebook each call is its own cell and gets its own
# figure; run as a plain script these would all draw onto the same axes.
sns.lineplot(data=df, x='cement', y='strength')
sns.lineplot(data=df, x='water', y='strength')
sns.lineplot(data=df, x='slag', y='strength')
sns.lineplot(data=df, x='ash', y='strength')
sns.lineplot(data=df, x='coarseagg', y='strength')
sns.lineplot(data=df, x='fineagg', y='strength')
sns.lineplot(data=df, x='age', y='strength')
sns.lineplot(data=df, x='superplastic', y='strength')
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
# Baseline comparison: 10-fold cross-validated R^2 for five regressors, each
# wrapped in a pipeline with min-max scaling so the scaler is fit per fold.
modelTuples = (
    ('LR', LinearRegression()),
    ('SVR', SVR()),
    ('RFR', RandomForestRegressor(random_state=1)),
    ('BAGR', BaggingRegressor(random_state=1)),
    ('XGR', XGBRegressor(random_state=1)),
)
features = df.drop('strength', axis=1)
target = df['strength']
val_scores = {
    name: cross_val_score(
        Pipeline([('scaler', MinMaxScaler()), (name, estimator)]),
        features, target, cv=10, n_jobs=None,
    )
    for name, estimator in modelTuples
}
print(val_scores)
# Summarise the per-fold scores per model.
comparison_df = pd.DataFrame.from_dict(val_scores, orient='columns')
comparison_df.describe().transpose()
# Hold-out evaluation: 70/30 split, then test-set R^2 for each pipeline.
from sklearn.model_selection import train_test_split
X = df.drop('strength', axis=1)
Y = df.strength
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
score = {}
for name, estimator in modelTuples:
    fitted = Pipeline([('scaler', MinMaxScaler()), (name, estimator)]).fit(X_train, Y_train)
    score[name] = fitted.score(X_test, Y_test)
comp_test_data = pd.DataFrame.from_dict(score, orient='index')
comp_test_data
# Collects [train, test] scores of every tuned variant for a final comparison.
final_scores = {}
# Inspect the tunable parameters of a random forest.
RandomForestRegressor().get_params()
from sklearn.model_selection import GridSearchCV
# Grid search over the scaled random-forest pipeline (R^2, 5-fold CV).
model = Pipeline([('scaler', MinMaxScaler()), ('RFR', RandomForestRegressor(random_state=1))])
param_grid = dict(
    RFR__n_estimators=[50, 100, 200, 300],
    RFR__max_depth=[3, 6, 9],
    RFR__max_features=[3, 5, 8],
    RFR__bootstrap=[True, False],
)
gridmodel = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=5, n_jobs=None)
gridmodel.fit(X_train, Y_train)
print(gridmodel.best_score_)
print(gridmodel.best_params_)
# Final random forest with the hyper-parameters found by the grid search.
# BUG FIX: the original fit a *fresh* MinMaxScaler on X_test, letting the test
# set define its own feature ranges (data leakage / inconsistent scaling).
# Fit the scaler once on the training data and reuse it for the test set.
from sklearn.pipeline import make_pipeline
final_model = RandomForestRegressor(bootstrap=False, max_depth=9, max_features=5, n_estimators=200, random_state=1)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
final_model.fit(X_train, Y_train)
# Rank features by the forest's impurity-based importances.
feature_imps = final_model.feature_importances_
feats = pd.DataFrame({'feature': X.columns, 'Importance': feature_imps})
feats.sort_values('Importance', ascending=False)
train_score = final_model.score(X_train, Y_train)
test_score = final_model.score(X_test, Y_test)
print("Training score: {} Test score: {}".format(train_score, test_score))
final_scores['RFR'] = [train_score, test_score]
# Inspect the tunable parameters of XGBoost, then grid-search the scaled
# XGBoost pipeline (R^2, 5-fold CV).
XGBRegressor().get_params()
model = Pipeline([('scaler', MinMaxScaler()), ('XGB', XGBRegressor(random_state=1))])
param_grid = dict(
    XGB__n_estimators=[50, 100, 200, 300],
    XGB__max_depth=[3, 6, 9],
    XGB__learning_rate=[0.01, 0.1, 1.0],
)
gridmodel = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=5, n_jobs=None)
gridmodel.fit(X_train, Y_train)
print(gridmodel.best_score_)
print(gridmodel.best_params_)
# Final XGBoost with the hyper-parameters found by the grid search.
# BUG FIX: the original re-fit MinMaxScaler separately on X_train (already
# scaled in the RFR section) and on X_test, so the test set defined its own
# feature ranges (data leakage). Fit one scaler on the training data and
# apply it to both splits.
final_model = XGBRegressor(learning_rate=0.1, max_depth=6, n_estimators=300, random_state=1)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
final_model.fit(X_train, Y_train)
# Rank features by XGBoost's importances.
feature_imps = final_model.feature_importances_
feats = pd.DataFrame({'feature': X.columns, 'Importance': feature_imps})
feats.sort_values('Importance', ascending=False)
train_score = final_model.score(X_train, Y_train)
test_score = final_model.score(X_test, Y_test)
print("Training score: {} Test score: {}".format(train_score, test_score))
final_scores['XGB'] = [train_score, test_score]
# Side-by-side comparison of the tuned models so far.
comp_df = pd.DataFrame.from_dict(final_scores, orient='index', columns=['Training score', 'Test score'])
comp_df
# Feature engineering: degree-2 polynomial expansion of the features, then
# refit the tuned random forest on the expanded training set.
final_model = RandomForestRegressor(bootstrap=False, max_depth=9, max_features=5, n_estimators=200, random_state=1)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
# BUG FIX: the transformer must be fit on the training data only and then
# applied to the test data; the original called fit_transform on X_test too.
X1_train = poly.fit_transform(X_train)
X1_test = poly.transform(X_test)
X1_train.shape, X1_test.shape
final_model.fit(X1_train, Y_train)
train_score = final_model.score(X1_train, Y_train)
test_score = final_model.score(X1_test, Y_test)
print("Training score: {} Test score: {}".format(train_score, test_score))
final_scores['RFR with Feature engineering'] = [train_score, test_score]
from sklearn.preprocessing import PowerTransformer
# Repeat the tuned-RFR evaluation with a Yeo-Johnson power transform applied
# inside the pipeline, so the transform is learned on training folds only.
X = df.drop('strength', axis=1)
Y = df.strength
estimator = RandomForestRegressor(bootstrap=False, max_depth=9, max_features=5,
                                  n_estimators=200, random_state=1)
pipe = Pipeline([
    ('transform', PowerTransformer(standardize=True, method='yeo-johnson')),
    ('model', estimator),
])
# 10-fold cross-validated R^2.
val_score = cross_val_score(pipe, X, Y, cv=10, n_jobs=None)
print(val_score)
print(f"Mean: {val_score.mean()} Std: {val_score.std()}")
# Same 70/30 hold-out split as before for train/test scores.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
pipe.fit(X_train, Y_train)
train_score = pipe.score(X_train, Y_train)
test_score = pipe.score(X_test, Y_test)
print(train_score, test_score)
final_scores['RFR with PowerTransform'] = [train_score, test_score]
# Final comparison table of all tuned variants.
comp_df = pd.DataFrame.from_dict(final_scores, orient='index', columns=['Training score', 'Test score'])
comp_df
# Visual check: column distributions after a Yeo-Johnson power transform
# (applied to the whole frame here purely for plotting).
transformer = PowerTransformer(standardize=True, method='yeo-johnson')
transformer.fit(df)
data = transformer.transform(df)
df1 = pd.DataFrame(data, columns=df.columns)
df1.head(10)
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot with a KDE overlay is the supported replacement for the same plot.
fig, axes = plt.subplots(3, 3, figsize=(18, 10))
for ax, col in zip(axes.flat, ['cement', 'slag', 'ash', 'water', 'superplastic',
                               'coarseagg', 'fineagg', 'age', 'strength']):
    sns.histplot(df1[col], kde=True, ax=ax)